
/**********************************************************************/
/*
   Author: Robbie Dulin, modified by Nikhil and Michelle
   Created: 11 June 2021, modified on 27 September, 2022
   Description: Checks and cleans PMO data on batches.
   Note that this code works with PII data (not deidentified)
   PMO_merge_batch_1_22 runs before this to produce all_batches_raw_1-22.dta
   Note: to set filepaths, run MASTER.do.

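   Outputs:
   all_batches_cleaned_1-22.dta
   all_batches_cleaned_wide_1-22.dta
   Intermediate files also saved along the way:
   all_batches_cleaned_wide_1-22_part1.dta, _part2.dta, _part3.dta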

*/
/**********************************************************************/
  	* Set filepaths (skipped when the global master_run is already "1")
	if "$master_run" !="1" include "./Do/SET_FILEPATHS.do"

	* Log: the file name is prefixed with today's date formatted by %tdCYND
	cap log close
	* date() parses c(current_date), which is a "DD Mon YYYY" string, into a daily date
	local prefix : display %tdCYND date("`c(current_date)'", "DMY")
	* prefix is a LOCAL macro, so it has to be referenced with local-macro quotes;
	* a global-style reference would resolve to a different (possibly empty) macro
	log using "$KP_logs/`prefix'_PMO_clean_wide_b1-22.txt", replace text

/*---------------------------------------------------*/
                  /* Section: Checks */
/*----------------------------------------------------*/

	use "$KP_deid_admin/Clean/all_batches_raw_1-22.dta", clear

* Flag rows where the new and old has_passed_current_batch disagree.
* Rows where either version is missing are not counted as disagreements.
	gen has_passed_current_batch_diff = ///
		(has_passed_current_batch != has_passed_current_batch_old) ///
		& !missing(has_passed_current_batch) ///
		& !missing(has_passed_current_batch_old)
	tab has_passed_current_batch_diff

* Where the new data transfer has no value, fall back to the old variable
	replace has_passed_current_batch = has_passed_current_batch_old ///
		if missing(has_passed_current_batch)

* Count the number of appearances (rows) per applicant
	tabulate batch, missing
	gsort anon_id4 batch
	* appearances: total rows for this anon_id4
	by anon_id4: generate appearances = _N
	* last_appearance: 1 on the applicant's final row in batch order, 0 otherwise
	by anon_id4 (batch): generate last_appearance = (_n == _N)
	* One row per applicant: distribution of the number of appearances
	tabulate appearances if last_appearance, missing
	gstats summ appearances if last_appearance

* Missing-value checks
	* A literal "." in hh_id is treated as an empty household ID
	replace hh_id = "" if hh_id == "."
	count if hh_id == ""
	di `r(N)' / _N

	* Key variables must never be system-missing
	foreach keyvar of varlist anon_id4 batch has_passed_current_batch {
		assert `keyvar' != .
	}

	* Share of rows with an empty string in each of these variables
	foreach strvar of varlist code_domisili userid_bps {
		count if `strvar' == ""
		di `r(N)' / _N
	}

* Province and city codes: for batches up to 17 they are taken from
* code_domisili (characters 1-2 and 3-4); later batches keep their existing values
	replace province_code = substr(code_domisili, 1, 2) if batch <= 17
	replace city_code     = substr(code_domisili, 3, 2) if batch <= 17
	* String concatenation of the two codes
	gen area_code = province_code + city_code

* Dummy for province codes 31 through 36 (Java)
* NOTE(review): a missing province_code yields java = 0 rather than missing;
* confirm this is the intended coding
	destring province_code, replace
	gen java = (province_code >= 31) & (province_code <= 36)

* Missing area codes, overall and by batch
	gen area_miss = missing(area_code)
	tab batch area_miss, row

	forval b = 1/22 {
		di "batch = `b':"
		tab area_miss has_passed_current_batch if batch == `b', col
	}

* Check that demographic variables are constant within applicant.
* tag_X is 1 when the smallest and largest value of X differ within anon_id4,
* 0 when they agree, and missing for applicants who appear only once.
	foreach demo of varlist year_dob month_dob gender education area_code hh_id test_score {
		gsort anon_id4 `demo'
		by anon_id4 (`demo'): gen tag_`demo' = (`demo'[1] != `demo'[_N]) if appearances > 1
		tab tag_`demo'
	}

* list anon_id4 batch education if tag_education == 1
	tab tag_hh_id gender, col

* Check that once a user is accepted, the user no longer applies
* NOTE: all the problematic observations are in batches 1-3
	gsort anon_id4 batch
	by anon_id4 (batch): gen appearance_num = _n
	* On winning rows only: 1 if the win is not the applicant's last appearance, else 0
	gen select_batch_not_last = appearance_num != appearances if has_passed_current_batch == 1
	tab select_batch_not_last

	* Per-applicant COUNT of such rows (egen total), so it can be larger than 1
	by anon_id4: egen apply_after_selected = total(select_batch_not_last)
	tab apply_after_selected
	* Any positive count marks an applicant who applied again after winning
	tab batch if apply_after_selected >= 1 & !missing(apply_after_selected)
* list anon_id4 batch has_passed_current_batch if apply_after_selected >= 1, clean noobs

* Number of individuals applying from the same household
	gsort hh_id batch
	* Rows per household-batch (left missing when hh_id is empty)
	by hh_id batch: gen num_apply_in_batch_hh = _N if hh_id != ""
	* Tag one row per (hh_id, anon_id4) pair, then count distinct applicants per household
	gegen unique_applicants_hh = tag(hh_id anon_id4)
	by hh_id: gegen total_apply_hh = total(unique_applicants_hh)

	* Tabulations on a temporary copy of the data; restore puts everything back
	preserve
	gsort hh_id batch
	* Keep one row per household-batch
	by hh_id batch: keep if _n == 1
	tab num_apply_in_batch_hh

	forval b = 1/22 {
		di "Batch: `b':"
		tab num_apply_in_batch_hh if batch == `b'
	}

	* Keep one row per household
	by hh_id: keep if _n == 1
	tab total_apply_hh
	restore

** Batch wins
* Does the individual ever win? Uses has_passed_current_batch, which was filled
* in from the old variable wherever the new data transfer was missing.
* ever_win_22 and ever_win_17 are per-individual totals of winning rows.
*
* Sort by batch within individual so that ord == 1 is always the earliest
* application. ever_win_17 is missing on rows with batch > 17 and sort breaks
* ties in random order, so sorting on anon_id4 alone could put a batch > 17 row
* first and drop that individual from the ever_win_17 tabulation below.
	gsort anon_id4 batch
	by anon_id4: gegen ever_win_22 = total(has_passed_current_batch)
	by anon_id4: gegen ever_win_17 = total(has_passed_current_batch) if batch <= 17
	* ever_win_39: 1 when batch_sk is non-missing, 0 otherwise
	gen ever_win_39 = 0
	replace ever_win_39 = 1 if !missing(batch_sk)
	* ord indexes rows within individual in batch order; later sections overwrite it
	by anon_id4 (batch): gen ord = _n

	la var ever_win_22 "Treated by Batch 22"
	la var ever_win_17 "Treated by Batch 17"
	la var ever_win_39 "Treated by Batch 39"
	* One row per individual
	tab ever_win_22 if ord == 1
	tab ever_win_17 if ord == 1

* does HH ever win by batch 17
* Sort by batch within household so that ord == 1 is always the household's
* earliest application. hh_num_wins_17 is missing on rows with batch > 17 and
* sort breaks ties in random order, so sorting on hh_id alone could put a
* batch > 17 row first and drop that household from the tabulations below.
	gsort hh_id batch
	by hh_id (batch): replace ord = _n
	by hh_id: gegen hh_num_wins_17 = total(has_passed_current_batch) if hh_id != "" & batch <= 17
	tab hh_num_wins_17 if ord == 1
	* 0/1 indicator; stays missing wherever the count is missing
	gen hh_ever_win_17 = hh_num_wins_17 > 0 if !missing(hh_num_wins_17)
	tab hh_ever_win_17 if ord == 1

* does HH ever win by batch 22
	gsort hh_id batch
	by hh_id (batch): replace ord = _n
	by hh_id: gegen hh_num_wins_22 = total(has_passed_current_batch) if hh_id != ""
	tab hh_num_wins_22 if ord == 1
	* 0/1 indicator; stays missing wherever the count is missing
	gen hh_ever_win_22 = hh_num_wins_22 > 0 if !missing(hh_num_wins_22)
	tab hh_ever_win_22 if ord == 1

* Is this row the first batch the individual applies in?
	gsort anon_id4
	* Earliest batch observed for each individual
	gegen first_apply_batch = min(batch), by(anon_id4)
	by anon_id4: replace ord = _n
	* 1 on rows belonging to that earliest batch, 0 otherwise
	gen first_apply = (batch == first_apply_batch)
	tab first_apply_batch if ord == 1
	tab batch first_apply, row

* Does the household win in this batch?
	gsort hh_id batch
	by hh_id: replace ord = _n
	* Number of winning rows per household-batch (missing when hh_id is empty)
	by hh_id batch: gegen hh_wins_in_batch = total(has_passed_current_batch) if hh_id != ""
	tab batch hh_wins_in_batch if ord == 1, row

	* Same tabulation on one row per household-batch, using a temporary copy
	preserve
	gsort hh_id batch
	by hh_id batch: keep if _n == 1
	tab batch hh_wins_in_batch
	restore

	* 0/1 indicator; stays missing wherever the count is missing
	gen hh_win_in_batch = (hh_wins_in_batch > 0) if !missing(hh_wins_in_batch)
	tab batch hh_win_in_batch if ord == 1, row

	* Remove helper variables that were only needed for the checks above
	drop tag_* appearance*
	drop select_batch_not_last unique_applicants_hh num_apply_in_batch_hh total_apply_hh
	drop ord has_passed_current_batch_diff

	gsort anon_id4 batch

** Age: batch open date minus birth date
* Batch opening dates. Batch 1 creates the variable; the list below holds the
* opening dates of batches 2 through 22, in batch order.
	gen batch_open = td("11apr2020") if batch == 1
	local later_open_dates ///
		21apr2020 27apr2020 8aug2020  15aug2020 27aug2020 3sep2020  10sep2020 ///
		17sep2020 26sep2020 2nov2020  23feb2021 4mar2021  11mar2021 18mar2021 ///
		25mar2021 5jun2021  16aug2021 26aug2021 9sep2021  16sep2021 25oct2021
	local b = 1
	foreach open_date of local later_open_dates {
		local ++b
		replace batch_open = td("`open_date'") if batch == `b'
	}

* Birth date, approximated as the 15th of the birth month
	gen birth_date = mdy(month_dob, 15, year_dob)
	format batch_open birth_date %td

* Age in completed years at batch opening
	cap drop age
	gen age = floor((batch_open - birth_date) / 365.25)

* Calendar year of the batch: batches up to 11 are 2020, batches from 12 on are 2021
	gen batch_year = 2020 if batch <= 11
	replace batch_year = 2021 if batch >= 12

* Label variables added by the additional-variables data transfer
* NOTE(review): Stata variable labels hold at most 80 characters, so the longer
* labels below are presumably stored truncated - confirm, and shorten them if
* the full text matters
	la var has_passed_current_batch "Indicator for having passed current batch (additional variables data transfer, Sep '22)"
	la var batch_sk "Batch that respondent was treated, up to batch 39 (additional variables data transfer, Sep '22)"
	la var date_batch "Date that applicant was announced winner of lottery (additional variables data transfer, Sep '22)"
	* Variable name and label must be separated by a space, and the label should not start with one
	la var date_incentive "Date that applicant received cash transfer (additional variables data transfer, Sep '22)"
	la var status_revoked "Flag for having status revoked due to a) not registering for a course on time or b) received other social assistance programs (additional variables data transfer, Sep '22)"

* Save the long (one row per applicant-batch) data, sorted and with key variables first
	gsort anon_id4 batch
	order anon_id4 hh_id userid_bps batch gender has_passed_current_batch
	save "$KP_deid_admin/Clean/all_batches_cleaned_1-22.dta", replace

/*----------------------------------------------------*/
               /* Section: Wide Data */
/*----------------------------------------------------*/

* Drop users that apply again after being selected
// NOTE: may change handling of this later
	* apply_after_selected is a per-user count (egen total), so any positive
	* value marks such a user, not only the value 1
	drop if apply_after_selected >= 1 & !missing(apply_after_selected)
	gsort anon_id4 userid_bps

* Prepare for reshape wide
	* applied = 1 on every existing row; after reshaping it is missing for batches not applied to
	gen applied = 1
	* Add a trailing underscore to stubs that end in a number, so the batch
	* suffix appended by the reshape stays distinguishable from the stub itself
	rename aaa* aaa*_
	rename ever_win_* ever_win_*_
	rename hh_ever_win_* hh_ever_win_*_
	compress

* Part 1: application flag, IDs, location codes and win variables
	local part1_stubs applied userid_bps hh_id area_code province_code ///
		ever_win_*_ hh_ever_win_*_ has_passed_current_batch

	* Work on a temporary copy; the long data come back at restore
	preserve
	keep `part1_stubs' anon_id4 batch
	* One row per anon_id4, one column per stub and batch
	greshape wide `part1_stubs', i(anon_id4) j(batch) nochecks benchmark
	save "$KP_deid_admin/Clean/all_batches_cleaned_wide_1-22_part1.dta", replace
	restore

* Part 2: old pass flag, household win in batch, and demographic variables
	local part2_stubs has_passed_current_batch_old hh_win_in_batch area_miss ///
		first_apply_batch bobot year_dob month_dob gender education test_score

	* Work on a temporary copy; the long data come back at restore
	preserve
	keep `part2_stubs' anon_id4 batch
	* One row per anon_id4, one column per stub and batch
	greshape wide `part2_stubs', i(anon_id4) j(batch) nochecks benchmark
	save "$KP_deid_admin/Clean/all_batches_cleaned_wide_1-22_part2.dta", replace
	restore



* Part 3: the aaa questionnaire items (renamed earlier with a trailing underscore)
	local part3_stubs aaa*_

	* Work on a temporary copy; the long data come back at restore
	preserve
	keep `part3_stubs' anon_id4 batch
	* One row per anon_id4, one column per stub and batch
	greshape wide `part3_stubs', i(anon_id4) j(batch) nochecks benchmark
	save "$KP_deid_admin/Clean/all_batches_cleaned_wide_1-22_part3.dta", replace
	restore


* Merge the three wide parts back together on anon_id4
	use "$KP_deid_admin/Clean/all_batches_cleaned_wide_1-22_part1.dta", clear
	foreach part in 2 3 {
		merge 1:1 anon_id4 using "$KP_deid_admin/Clean/all_batches_cleaned_wide_1-22_part`part'.dta", nogen
	}

* Collapse the per-batch copies of each stub below into a single variable that
* holds the first non-missing copy, then drop the per-batch copies
	local dup_stubs area_miss ever_win_17_ ever_win_22_ ever_win_39_ first_apply_batch ///
		year_dob month_dob gender test_score education hh_ever_win_17_ hh_ever_win_22_
	* Questionnaire items aaa1-aaa9, aaa20-aaa27 and aaa31-aaa38
	foreach k of numlist 1/9 20/27 31/38 {
		local dup_stubs `dup_stubs' aaa`k'_
	}
	foreach stub of local dup_stubs {
		egen `stub' = rowfirst(`stub'*)
		drop `stub'?*
	}

* Strip the trailing underscores that were added before the reshape
	rename aaa?_ aaa?
	rename aaa*_ aaa*
	rename hh_ever_win_*_ hh_ever_win_*
	rename ever_win_*_ ever_win_*
	
	
	* Single-value versions of the per-batch ID and location variables,
	* plus the batch in which the respondent won
	gen area_code = ""
	gen province_code = .
	gen hh_id = ""
	gen userid_bps = ""
	gen win_batch = .

	* IDs and location come from the batch of first application. win_batch is
	* set for every batch with a pass flag of 1, so a later batch overwrites an earlier one.
	qui forval b = 1/22 {
		foreach v in area_code province_code hh_id userid_bps {
			replace `v' = `v'`b' if first_apply_batch == `b'
		}
		replace win_batch = `b' if has_passed_current_batch`b' == 1
	}

	* Drop the per-batch copies (name followed by at least one more character)
	drop hh_id?* area_code?* userid_bps?* province_code?*
	rename province_code prov_id
	rename has_passed_current_batch* win_in_batch*

* Label variables in the wide data
	label variable area_miss "Missing Area Code"
	label variable ever_win_17 "Ever Won Prakerja, by Batch 17"
	label variable ever_win_22 "Ever Won Prakerja, by Batch 22"
	label variable ever_win_39 "Ever Won Prakerja, by Batch 39"
	label variable first_apply_batch "Batch of First Application"
	label variable year_dob "Year of Birth"
	label variable month_dob "Month of Birth"
	label variable gender "Gender"
	label variable test_score "Score on Skills Test"
	label variable education "Education level"
	label variable hh_ever_win_17 "Household Ever Won Prakerja, by Batch 17"
	label variable hh_ever_win_22 "Household Ever Won Prakerja, by Batch 22"

	* Questionnaire items: each label starts with the registrant cohort it applies to
	local y2020 "[For 2020 registrants]"
	local sem1  "[For 2021 semester 1 registrants]"
	local sem2  "[For 2021 semester 2 registrants]"

	label variable aaa1 "`y2020' Are you unemployed at the moment?"
	label variable aaa2 "`y2020' Are you an employee?"
	label variable aaa3 "`y2020' Have you ever worked before?"
	label variable aaa4 "`y2020' Are you unemployed due to Covid-19?"
	label variable aaa5 "`y2020' Do you have job contract?"
	label variable aaa6 "`y2020' Are you self-employed?"
	label variable aaa7 "`y2020' Covid-19 impact: decrease revenue, less customer?"
	label variable aaa8 "`y2020' Is your business closed temporarily due to government advice?"
	label variable aaa9 "`y2020' Is your business closed temporarily because you cannot pay your employee?"

	* The aaa20 label is written out in full because its cohort text contains a comma
	label variable aaa20 "[For 2021, semester 1 registrants] Are you unemployed at the moment?"
	label variable aaa21 "`sem1' Are you an employee?"
	label variable aaa22 "`sem1' Have you ever worked before?"
	label variable aaa23 "`sem1' Are you self-employed?"
	label variable aaa24 "`sem1' Do you have job contract?"
	label variable aaa25 "`sem1' Are you unemployed due to Covid-19?"
	label variable aaa26 "`sem1' Covid-19 impact: decrease revenue, less customer?"
	label variable aaa27 "`sem1' Does your working hour decrease due to Covid-19?"

	label variable aaa31 "`sem2' Are you unemployed at the moment?"
	label variable aaa32 "`sem2' Are you an employee?"
	label variable aaa33 "`sem2' Have you ever worked before?"
	label variable aaa34 "`sem2' Are you self-employed?"
	label variable aaa35 "`sem2' Do you have job contract?"
	label variable aaa36 "`sem2' Are you unemployed due to Covid-19?"
	label variable aaa37 "`sem2' Covid-19 impact: decrease revenue, less customer?"
	label variable aaa38 "`sem2' Does your working hour decrease due to Covid-19?"

	label variable area_code "Area Code"
	label variable prov_id "Province ID"
	label variable hh_id "Household ID"
	label variable win_batch "Batch that Respondent Won Prakerja (by batch 22)"
	label variable userid_bps "BPS User ID"
	* la var java "Dummy for being on Java"
	
* Save the wide (one row per applicant) data
	compress
	save "$KP_deid_admin/Clean/all_batches_cleaned_wide_1-22.dta", replace


* Close the log if one is open
capture log close


// DONE
